001 /*********************************************************************************************** 002 * Tekijä: Jukka Salminen 003 * Opiskelijanumero: i79947 004 * Email: jukka.salminen@uwasa.fi 005 * Tekoaika: 23.9.2002 006 * Kurssi/vuosi: Ohjelmoinnin jatkokurssi/kevät 2002 007 * Työn tunnus: Harjoitustyö: Taulukkoeditori web-sivulle 008 *********************************************************************************************** 009 * QDParser-luokka 010 * =============== 011 * Lähde: www.javaworld.com 012 *http://www.javaworld.com/javaworld/javatips/jw-javatip128.html 013 * Quick and Dirty xml parser. This parser is, like the SAX parser, 014 * an event based parser, but with much less functionality. */ 015 package jsxml.qdxml; 016 import java.io.*; 017 import java.util.*; 018 019 public class QDParser 020 { 021 private static int popMode(Stack st) 022 { 023 if(!st.empty()) 024 return ((Integer)st.pop()).intValue(); 025 else 026 return PRE; 027 } 028 private final static int 029 TEXT = 1, 030 ENTITY = 2, 031 OPEN_TAG = 3, 032 CLOSE_TAG = 4, 033 START_TAG = 5, 034 ATTRIBUTE_LVALUE = 6, 035 ATTRIBUTE_EQUAL = 9, 036 ATTRIBUTE_RVALUE = 10, 037 QUOTE = 7, 038 IN_TAG = 8, 039 SINGLE_TAG = 12, 040 COMMENT = 13, 041 DONE = 11, 042 DOCTYPE = 14, 043 PRE = 15, 044 CDATA = 16; 045 public static void parse(DocHandler doc,Reader r) throws Exception 046 { 047 Stack st = new Stack(); 048 int depth = 0; 049 int mode = PRE; 050 int c = 0; 051 int quotec = '"'; 052 depth = 0; 053 StringBuffer sb = new StringBuffer(); 054 StringBuffer etag = new StringBuffer(); 055 String tagName = null; 056 String lvalue = null; 057 String rvalue = null; 058 Hashtable attrs = null; 059 st = new Stack(); 060 doc.startDocument(); 061 int line=1, col=0; 062 boolean eol = false; 063 String nextchar = new String(""); 064 while((c = r.read()) != -1) 065 { 066 067 // We need to map \r, \r\n, and \n to \n 068 // See XML spec section 2.11 069 if(c == '\n' && eol) 070 { 071 eol = false; 072 continue; 073 } else if(eol) 074 { 075 eol = false; 076 } else if(c == '\n') 077 { 078 line++; 079 col=0; 080 } else if(c == '\r') 081 { 082 eol = true; 083 c = '\n'; 084 line++; 085 col=0; 086 } else 087 { 088 col++; 089 } 090 091 if(mode == DONE) 092 { 093 doc.endDocument(); 094 return; 095 096 // We are between tags collecting text. 097 } else if(mode == TEXT) 098 { 099 if(c == '<') 100 { 101 st.push(new Integer(mode)); 102 mode = START_TAG; 103 if(sb.length() > 0) 104 { 105 doc.text(sb.toString()); 106 sb.setLength(0); 107 } 108 } else if(c == '&') 109 { 110 st.push(new Integer(mode)); 111 mode = ENTITY; 112 etag.setLength(0); 113 } else 114 sb.append((char)c); 115 116 // we are processing a closing tag: e.g. </foo> 117 } else if(mode == CLOSE_TAG) 118 { 119 if(c == '>') 120 { 121 mode = popMode(st); 122 tagName = sb.toString(); 123 sb.setLength(0); 124 depth--; 125 if(depth==0) 126 mode = DONE; 127 doc.endElement(tagName); 128 } else 129 { 130 sb.append((char)c); 131 } 132 133 // we are processing CDATA 134 } else if(mode == CDATA) 135 { 136 if(c == '>' 137 && sb.toString().endsWith("]]")) 138 { 139 sb.setLength(sb.length()-2); 140 doc.text(sb.toString()); 141 sb.setLength(0); 142 mode = popMode(st); 143 } else 144 sb.append((char)c); 145 146 // we are processing a comment. We are inside 147 // the <!-- .... --> looking for the -->. 148 } else if(mode == COMMENT) 149 { 150 if(c == '>' 151 && sb.toString().endsWith("--")) 152 { 153 sb.setLength(0); 154 mode = popMode(st); 155 } else 156 sb.append((char)c); 157 158 // We are outside the root tag element 159 } else if(mode == PRE) 160 { 161 if(c == '<') 162 { 163 mode = TEXT; 164 st.push(new Integer(mode)); 165 mode = START_TAG; 166 } 167 168 // We are inside one of these <? ... ?> 169 // or one of these <!DOCTYPE ... > 170 } else if(mode == DOCTYPE) 171 { 172 if(c == '>') 173 { 174 mode = popMode(st); 175 if(mode == TEXT) mode = PRE; 176 } 177 178 // we have just seen a < and 179 // are wondering what we are looking at 180 // <foo>, </foo>, <!-- ... --->, etc. 181 } else if(mode == START_TAG) 182 { 183 mode = popMode(st); 184 if(c == '/') 185 { 186 st.push(new Integer(mode)); 187 mode = CLOSE_TAG; 188 } else if (c == '?') 189 { 190 mode = DOCTYPE; 191 } else 192 { 193 st.push(new Integer(mode)); 194 mode = OPEN_TAG; 195 tagName = null; 196 attrs = new Hashtable(); 197 sb.append((char)c); 198 } 199 200 // we are processing an entity, e.g. <, », etc. 201 } else if(mode == ENTITY) 202 { 203 if(c == ';') 204 { 205 mode = popMode(st); 206 String cent = etag.toString(); 207 etag.setLength(0); 208 if(cent.equals("lt")) 209 sb.append('<'); 210 else if(cent.equals("gt")) 211 sb.append('>'); 212 else if(cent.equals("amp")) 213 sb.append('&'); 214 else if(cent.equals("quot")) 215 sb.append('"'); 216 else if(cent.equals("apos")) 217 sb.append('\''); 218 // Could parse hex entities if we wanted to 219 //else if(cent.startsWith("#x")) 220 //sb.append((char)Integer.parseInt(cent.substring(2),16)); 221 else if(cent.startsWith("#")) 222 sb.append((char)Integer.parseInt(cent.substring(1))); 223 // Insert custom entity definitions here 224 else 225 exc("Unknown entity: &"+cent+";",line,col); 226 } else 227 { 228 etag.append((char)c); 229 } 230 231 // we have just seen something like this: 232 // <foo a="b"/ 233 // and are looking for the final >. 234 } else if(mode == SINGLE_TAG) 235 { 236 if(tagName == null) 237 tagName = sb.toString(); 238 if(c != '>') 239 exc("Expected > for tag: <"+tagName+"/>",line,col); 240 doc.startElement(tagName,attrs); 241 doc.endElement(tagName); 242 if(depth==0) 243 { 244 doc.endDocument(); 245 return; 246 } 247 sb.setLength(0); 248 attrs = new Hashtable(); 249 tagName = null; 250 mode = popMode(st); 251 252 // we are processing something 253 // like this <foo ... >. It could 254 // still be a <!-- ... --> or something. 255 } else if(mode == OPEN_TAG) 256 { 257 if(c == '>') 258 { 259 if(tagName == null) 260 tagName = sb.toString(); 261 sb.setLength(0); 262 depth++; 263 doc.startElement(tagName,attrs); 264 tagName = null; 265 attrs = new Hashtable(); 266 mode = popMode(st); 267 } else if(c == '/') 268 { 269 mode = SINGLE_TAG; 270 } else if(c == '-' && sb.toString().equals("!-")) 271 { 272 mode = COMMENT; 273 } else if(c == '[' && sb.toString().equals("![CDATA")) 274 { 275 mode = CDATA; 276 sb.setLength(0); 277 } else if(c == 'E' && sb.toString().equals("!DOCTYP")) 278 { 279 sb.setLength(0); 280 mode = DOCTYPE; 281 } else if(Character.isWhitespace((char)c)) 282 { 283 tagName = sb.toString(); 284 sb.setLength(0); 285 mode = IN_TAG; 286 } else 287 { 288 sb.append((char)c); 289 } 290 291 // We are processing the quoted right-hand side 292 // of an element's attribute. 293 } else if(mode == QUOTE) 294 { 295 if(c == quotec) 296 { 297 rvalue = sb.toString(); 298 sb.setLength(0); 299 attrs.put(lvalue,rvalue); 300 mode = IN_TAG; 301 // See section the XML spec, section 3.3.3 302 // on normalization processing. 303 } else if(" \r\n\u0009".indexOf(c)>=0) 304 { 305 sb.append(' '); 306 } else if(c == '&') 307 { 308 st.push(new Integer(mode)); 309 mode = ENTITY; 310 etag.setLength(0); 311 } else 312 { 313 sb.append((char)c); 314 } 315 316 } else if(mode == ATTRIBUTE_RVALUE) 317 { 318 if(c == '"' || c == '\'') 319 { 320 quotec = c; 321 mode = QUOTE; 322 } else if(Character.isWhitespace((char)c)) 323 { 324 ; 325 } else 326 { 327 exc("Error in attribute processing",line,col); 328 } 329 330 } else if(mode == ATTRIBUTE_LVALUE) 331 { 332 if(Character.isWhitespace((char)c)) 333 { 334 lvalue = sb.toString(); 335 sb.setLength(0); 336 mode = ATTRIBUTE_EQUAL; 337 } else if(c == '=') 338 { 339 lvalue = sb.toString(); 340 sb.setLength(0); 341 mode = ATTRIBUTE_RVALUE; 342 } else 343 { 344 sb.append((char)c); 345 } 346 347 } else if(mode == ATTRIBUTE_EQUAL) 348 { 349 if(c == '=') 350 { 351 mode = ATTRIBUTE_RVALUE; 352 } else if(Character.isWhitespace((char)c)) 353 { 354 ; 355 } else 356 { 357 exc("Error in attribute processing.",line,col); 358 } 359 360 } else if(mode == IN_TAG) 361 { 362 if(c == '>') 363 { 364 mode = popMode(st); 365 doc.startElement(tagName,attrs); 366 depth++; 367 tagName = null; 368 attrs = new Hashtable(); 369 } else if(c == '/') 370 { 371 mode = SINGLE_TAG; 372 } else if(Character.isWhitespace((char)c)) 373 { 374 ; 375 } else 376 { 377 mode = ATTRIBUTE_LVALUE; 378 sb.append((char)c); 379 } 380 } 381 } 382 if(mode == DONE) 383 doc.endDocument(); 384 else 385 exc("missing end tag",line,col); 386 } 387 private static void exc(String s,int line,int col) 388 throws Exception 389 { 390 throw new Exception(s+" near line "+line+", column "+col); 391 } 392 }